import re
from bs4 import BeautifulSoup #网页解析，获取数据
from urllib import request,error #制定URL，获取网页数据
# Chapter-heading pattern: captures text that starts with '第' and runs up to
# (not including) the last '<' on the same line ('.+' is greedy and does not
# cross newlines).
findtitle = re.compile(r'(第.+)<')
# Body-text pattern: captures everything on a line that precedes '<br/>'.
# Because '.+' is greedy, a line holding several '<br/>' tags yields a single
# match that extends to the last one.
findtext = re.compile(r'(.+)<br/>')

def askURL(url):
    """Fetch *url* and return its body decoded as GBK text.

    The request is sent with a desktop-browser ``User-Agent`` header and a
    3-second timeout.

    Args:
        url: Address of the page to download.

    Returns:
        The decoded page source. If opening or reading the URL fails, the
        error is printed and an empty string is returned instead of raising.
    """
    headers = {
        # Present the request as coming from a regular browser.
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36'
    }
    req = request.Request(url=url, headers=headers)
    html = ''
    try:
        # The context manager closes the response (and its underlying
        # connection) even when read() raises.
        with request.urlopen(req, timeout=3) as response:
            raw = response.read()
        # One malformed byte should not throw away the whole page, so
        # undecodable bytes are replaced with U+FFFD rather than raising.
        html = raw.decode('gbk', errors='replace')
    except Exception as err:
        # Deliberately broad: this is a best-effort fetch that reports the
        # problem and lets the caller continue with an empty page.
        print('发生错误：', err)
    return html

def getdata(html, textt):
    """Extract the chapter body lines from *html* and write them to *textt*.

    The page is parsed with BeautifulSoup, the first ``div`` whose class is
    ``mainContenr`` is located, and every ``findtext`` match inside its
    markup is emitted as one line. Non-breaking spaces (``\\xa0``) in a line
    are each replaced by two regular spaces.

    Args:
        html: Page source as a string.
        textt: Writable text stream; each extracted line is written to it
            followed by a newline.

    Each line is also printed to stdout. Nothing is written or printed when
    the content ``div`` is absent (for example, when *html* is empty).
    """
    soup = BeautifulSoup(html, 'html.parser')
    # NOTE(review): 'mainContenr' is spelled this way in the lookup;
    # presumably it mirrors the target site's markup — verify before
    # "correcting" it.
    content = soup.find('div', class_="mainContenr")
    if content is None:
        return
    for line in findtext.findall(str(content)):
        cleaned = line.replace('\xa0', '  ')
        print(cleaned)
        textt.write(cleaned + '\n')
if __name__ == '__main__':
    # Download one chapter page and save its body text to textt.txt.
    url = 'http://www.quanshuwang.com/book/44/44683/15379610.html'
    html = askURL(url)
    # 'with' guarantees the output file is flushed and closed even if
    # getdata() raises part-way through.
    with open('textt.txt', 'w', encoding='utf-8') as textt:
        getdata(html, textt)





